{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import arff\n",
    "import numpy as np\n",
    "import json\n",
    "from sklearn.model_selection import train_test_split, KFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = arff.load(open('dataset.arff', 'r'))\n",
    "data = np.array(dataset['data'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The dataset has 11055 datapoints with 30 features\n",
      "Features: ['having_IP_Address', 'URL_Length', 'Shortining_Service', 'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix', 'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length', 'Favicon', 'port', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor', 'Links_in_tags', 'SFH', 'Submitting_to_email', 'Abnormal_URL', 'Redirect', 'on_mouseover', 'RightClick', 'popUpWidnow', 'Iframe', 'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank', 'Google_Index', 'Links_pointing_to_page', 'Statistical_report', 'Result']\n"
     ]
    }
   ],
   "source": [
    "print('The dataset has {0} datapoints with {1} features'.format(data.shape[0], data.shape[1]-1))\n",
    "print('Features: {0}'.format([feature[0] for feature in dataset['attributes']]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data[:, [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 22, 30]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Before spliting\n",
      "X:(11055, 17), y:(11055,)\n",
      "After spliting\n",
      "X_train:(7738, 17), y_train:(7738,), X_test:(3317, 17), y_test:(3317,)\n"
     ]
    }
   ],
   "source": [
    "X, y = data[:, :-1], data[:, -1]\n",
    "y.reshape(y.shape[0])\n",
    "print('Before spliting')\n",
    "print('X:{0}, y:{1}'.format(X.shape, y.shape))\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n",
    "print('After spliting')\n",
    "print('X_train:{0}, y_train:{1}, X_test:{2}, y_test:{3}'.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved!\n"
     ]
    }
   ],
   "source": [
    "np.save('X_train.npy', X_train)\n",
    "np.save('X_test.npy', X_test)\n",
    "np.save('y_train.npy', y_train)\n",
    "np.save('y_test.npy', y_test)\n",
    "print('Saved!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Data written to testdata.json\n"
     ]
    }
   ],
   "source": [
    "test_data = dict()\n",
    "test_data['X_test'] = X_test.tolist()\n",
    "test_data['y_test'] = y_test.tolist()\n",
    "with open('../../static/testdata.json', 'w') as tdfile:\n",
    "    json.dump(test_data, tdfile)\n",
    "    print('Test Data written to testdata.json')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
